* Close any log left open by an earlier run; -capture- swallows the error
* that is raised when no log is currently open.
capture log close
* Never pause output waiting for a keypress.
set more off

*-----------*
* READ DATA *
*-----------*
* Load the raw census extract, discarding whatever data are in memory.
* The temp global used in the path is defined outside this file.
use "$temp\census_raw.dta", clear



*===============================================================================
*===============================================================================
*===============================================================================
*----------------------*
* VARIABLE ENGINEERING *
*----------------------*

* Indicator equal to 1 when sex == 2 and 0 for every other value of sex.
generate female = (sex == 2)

log using "$logs\race_variables.log", replace
* RACE VARIABLES *
* Log the raw distributions before race is collapsed below.
tabulate race
tabulate hispan


* Collapse race to a binary: every value other than 1 becomes 0.
* NOTE(review): a missing race would also be recoded to 0 here; confirm the
* raw variable has no missing values.
replace race = 0 if !(race == 1) // non-white

label define vrace 1 "White" 0 "Non-white", modify
label values race vrace

label variable race "0=non-white ; 1=white"

log close

* EDUCATION DUMMY CATEGORIES *
* Builds two schooling measures and logs the cross-checks:
*   edl   - 5-level attainment category derived from educ
*   edyrs - years of completed education derived from educd
log using "$logs\education_variables.log", replace

* Raw cross-tab of the detailed and general codes, before any recoding.
tab educd educ

* Statement order matters here: educ == 0 is blanked first, and only then are
* the educd == 2 cases put back as a genuine zero category.
replace educ = . if educ == 0 // set to missing if NA
replace educ = 0 if educd ==2 // add cat for no school completed
replace educd = . if educd == 1 // set to missing if NA

* NOTE(review): educ == 9 is not assigned to any level and would leave edl
* missing; confirm that code does not occur in this extract.
gen edl = 0 if inrange(educ,0,5) // < 12
replace edl = 1 if inlist(educ,6) // 12
replace edl = 2 if inlist(educ,7,8) // 13-15
replace edl = 3 if inlist(educ,10) // 16
replace edl = 4 if inlist(educ,11) // 17+

lab define vedl 0 "<12" 1 "12" 2 "13-15" 3 "16" 4 "17+", modify
lab values edl vedl

* Single label for edl. The script used to label it twice, and this is the
* text that survived into the saved data.
lab var edl "5-category years of completed education "
tab educ
tab educ edl

* YEARS OF EDUCATION *
* -tab, gen()- numbers the dummies edd1, edd2, ... by the rank of each distinct
* non-missing educd value, so the mapping below is positional.
tab educd, gen(edd)

* Guard the positional mapping: it addresses edd5 through edd16, with edd16
* the top category. With any other number of categories every position would
* refer to a different attainment level than the one named in its comment,
* and edyrs would be silently wrong.
if r(r) != 16 {
	display as error "educd has " r(r) " non-missing categories; the edd5-edd16 mapping assumes 16"
	exit 459
}

* edd1-edd4 (the categories ranked below grade 9) get no value, so edyrs
* stays missing for them.
gen edyrs = 9 if edd5 ==1 // grade 9
replace edyrs = 10 if edd6 ==1 // grade 10
replace edyrs = 11 if (edd7 ==1) | (edd8 ==1) // grade 11 or 12th grade no diploma
replace edyrs = 12 if (edd9 ==1) | (edd10 ==1) // HS grad or some college but less than a year
replace edyrs = 13 if edd11 ==1 // 1 or more years coll
replace edyrs = 15 if edd12 ==1 // associate degree
replace edyrs = 16 if edd13 ==1 // college
replace edyrs = 17 if edd14 ==1 // masters
replace edyrs = 19 if edd15 ==1 // professional grad
replace edyrs = 20 if edd16 ==1 // phd

lab var edyrs "Years of completed education"

* The dummies were only scaffolding for edyrs.
drop edd*
tab educd edyrs

log close

* WAGES *
* Cleans the three wage components, then builds an hourly wage and its log.
log using "$logs\wage_construction.log", replace

* Usual hours and weeks worked: a value of 0 is recoded to missing, with the
* distribution logged before and after the recode.
foreach v of varlist uhrswork wkswork1 {
	summarize `v', detail
	replace `v' = . if `v' == 0
	summarize `v', detail

}

* Wage income: 0 and anything at or above 999999 become missing.
summarize incwage, detail
replace incwage = . if incwage >= 999999 | incwage == 0
summarize incwage, detail

* Range checks on what is left; the script stops here if either one fails.
assert inrange(incwage, 1, 400000) if incwage != .
assert inrange(uhrswork, 1, 99) if uhrswork != .

* Hourly wage = annual wage income / (weeks worked x usual weekly hours).
* It is missing whenever any of the three components is missing.
generate wage = incwage / (wkswork1 * uhrswork)
summarize wage, detail

* Missing wages also satisfy this check, because Stata ranks missing above
* every number.
assert wage > 0
generate lnwage = ln(wage)
summarize wage lnwage, detail

label variable wage "wage = incwage/(wkswork1*uhrswork)"
label variable lnwage "log(wage)"

log close

* POTENTIAL EXPERIENCE *
* Age minus years of completed education minus 6. The result is missing
* whenever age or edyrs is missing.
generate exp = age - (edyrs + 6)
label variable exp "potential experience"

* OCCUPATION INDICATORS *
* Derives three occupation variables from the string code occsoc:
*   occ_major - numeric value of its first two characters
*   occ_minor - numeric value of its first three characters
*   occ_det   - one integer per distinct full code, labelled with that code
*
* The "000000" code is recoded to the empty string, which is the string
* missing value. It used to be recoded to ".", but -encode- treats "." as an
* ordinary category, so occ_det gained a spurious level for those rows while
* occ_major and occ_minor were missing. With "" all three are missing.
replace occsoc = "" if occsoc == "000000"
gen occma = substr(occsoc,1,2)
gen occmi = substr(occsoc,1,3)
* NOTE(review): -destring- without -force- refuses to create the new variable
* if any prefix contains a non-numeric character; confirm every code in this
* extract starts with three digits.
destring occma , gen(occ_major)
destring occmi , gen(occ_minor)
encode occsoc, gen(occ_det)
* The string helpers and the raw code are not needed after this point.
drop occma occmi occsoc

label var occ_major "occ major"
label var occ_minor "occ minor"
label var occ_det "Full-detail occ soc"

* INDUSTRY INDICATORS *
* Collapses the detailed industry code into 16 broad sectors (ind1).
*
* Codes 0 and 992 are set to missing first. The value label of ind is then
* decoded into the string variable inds, and each sector is assigned by
* matching inds against a list of 4-character codes. String -inlist()- takes
* only a limited number of arguments, so long lists are split into several
* calls joined with "|". Any code not listed below leaves ind1 missing.
* NOTE(review): -decode- requires ind to carry a value label whose text is the
* 4-character code; confirm against the raw extract.
replace ind =. if ind == 0 | ind == 992
decode ind, gen(inds)

gen ind1 = .

* 1: Agriculture, Forestry, Fishing and Hunting
replace ind1 = 1 if inlist(inds,"0017","0018","0019","0027","0028","0029")

* 2: Mining
replace ind1 = 2 if inlist(inds,"0037","0038","0039","0047","0048","0049")

* 3: Construction
replace ind1 = 3 if inlist(inds,"0077")

* 4: Manufacturing
replace ind1 = 4 if inlist(inds,"0107","0108","0109","0117","0118","0119","0127","0128") | ///
                    inlist(inds,"0129","0137","0139","0147","0148","0149","0157","0159") | ///
                    inlist(inds,"0167","0168","0169","0177","0179","0377","0378","0379") | ///
                    inlist(inds,"0387","0187","0188","0189","0199","0207","0209","0217") | ///
                    inlist(inds,"0218","0219","0227","0228","0229","0237","0238","0239") | ///
                    inlist(inds,"0247","0248","0249","0257","0259","0267","0268","0269") | ///
                    inlist(inds,"0277","0278","0279","0287","0288","0289","0297","0298") | ///
                    inlist(inds,"0299","0307","0308","0309","0317","0318","0319","0329") | ///
                    inlist(inds,"0336","0337","0338","0339","0347","0349","0357","0358") | ///
                    inlist(inds,"0359","0367","0368","0369","0389","0396","0397","0398") | ///
                    inlist(inds,"0399")

* 5: Wholesale trade
replace ind1 = 5 if inlist(inds,"0407","0408","0409","0417","0418","0419","0426","0427","0428") | ///
                    inlist(inds,"0429","0437","0438","0439","0447","0448","0449","0456") | ///
                    inlist(inds,"0457","0458","0459")

* 6: Retail trade
replace ind1 = 6 if inlist(inds,"0467","0468","0469","0477","0478","0479","0487","0488","0489") | ///
                    inlist(inds,"0497","0498","0499","0507","0508","0509","0517","0518") | ///
                    inlist(inds,"0519","0527","0528","0529","0537","0538","0539","0547") | ///
                    inlist(inds,"0548","0549","0557","0558","0559","0567","0568","0569","0579")

* 7: Transportation and Warehousing
replace ind1 = 7 if inlist(inds,"0607","0608","0609","0617","0618","0619","0627","0628") | ///
                    inlist(inds,"0638","0639","0629","0637")

* 8: Utilities
replace ind1 = 8 if inlist(inds,"0057","0058","0059","0067","0068","0069")

* 9: Information and Communications
replace ind1 = 9 if inlist(inds,"0647","0648","0649","0657","0659","0667","0668","0669") | ///
                    inlist(inds,"0677","0678","0679")

* 10: Finance, Insurance, Real Estate, and Rental and Leasing
replace ind1 = 10 if inlist(inds,"0687","0688","0689","0697","0699") | ///
                     inlist(inds,"0707","0708","0717","0718","0719")

* 11: Professional, Scientific, Management, Administrative, and Waste Management Services
replace ind1 = 11 if inlist(inds,"0727","0728","0729","0737","0738","0739","0746","0747") | ///
                     inlist(inds,"0748","0749","0757","0758","0759","0767","0768","0769") | ///
                     inlist(inds,"0777","0778","0779")

* 12: Educational, Health and Social Services
replace ind1 = 12 if inlist(inds,"0786","0787","0788","0789","0797","0798","0799","0807") | ///
                     inlist(inds,"0808","0809","0817","0818","0819","0827","0829","0837") | ///
                     inlist(inds,"0838","0839","0847")

* 13: Arts, Entertainment, Recreation, Accommodations, and Food Services
replace ind1 = 13 if inlist(inds,"0856","0857","0858","0859","0866","0867","0868","0869")

* 14: Other Services (Except Public Administration)
replace ind1 = 14 if inlist(inds,"0877","0878","0879","0887","0888","0889","0897","0898") | ///
                     inlist(inds,"0899","0907","0908","0909","0916","0917","0918","0919") | ///
                     inlist(inds,"0929")

* 15: Public Administration
replace ind1 = 15 if inlist(inds,"0937","0938","0939","0947","0948","0949","0957","0959")

* 16: Armed forces
replace ind1 = 16 if inlist(inds,"0967","0968","0969","0977","0978","0979","0987")

* Value labels for the 16 sectors. The label for sector 7 previously read
* "Transportion/Warehouse"; the spelling is corrected here because this text
* is printed in every table that uses ind1.
lab define vind 1 "Ag/Forest/fish/hunt" 2 "Mining" 3 "Construction" 4 "Manufacturing" ///
                5 "Wholesale trade" 6 "Retail trade" 7 "Transportation/Warehouse" ///
                8 "Utilities" 9 "Info and communication" 10 "Fin/Ins./R.E./Rental/Leasing" ///
                11 "Prof/Scientific/Manag./Admin/Waste Man" 12 "Educ/Health/Soc. Serv" ///
                13 "Arts/Ent/Rec/Accom/Food Srv." 14 "Other Services (exclude pub admin)" ///
                15 "Public administration" 16 "Armed forces", modify
lab values ind1 vind
lab variable ind1 "1-digit industry categories"

* REGION AND DIVISION DUMMIES *
* The incoming region variable is kept under the name division, and a new,
* coarser 4-level region is built from it.
rename region division
* One dummy per distinct value of division, numbered div1, div2, ... in
* ascending order of that value.
tabulate division, generate(div)

* Group the nine dummies into four regions. Rows with a missing division
* match no dummy and keep region missing.
* NOTE(review): the grouping is positional and presumes exactly nine distinct
* division values; confirm against the tabulation above.
generate region = .
replace region = 1 if inlist(1, div1, div2)
replace region = 2 if inlist(1, div3, div4)
replace region = 3 if inlist(1, div5, div6, div7)
replace region = 4 if inlist(1, div8, div9)

label define vreg 4 "West" 3 "South" 2 "Midwest" 1 "Northeast", modify
label values region vreg
* The dummies were only needed to build region.
drop div1 div2 div3 div4 div5 div6 div7 div8 div9

label variable region "Census region"
label variable division "Census division"

*===============================================================================
*===============================================================================
*===============================================================================

log using "$logs\sample_restrictions.log",replace

*---------------------*
* SAMPLE RESTRICTIONS *
*---------------------*
* Each -keep- is logged, so the number of observations removed by every
* restriction can be read off the log.
*
* Stata ranks missing above every number, so a bare ">" or ">=" test is true
* for missing values. The two inequality filters below therefore exclude
* missing explicitly. Without that, rows with missing wkswork1 or educ passed
* these steps and were only removed by another restriction, which made the
* per-step counts in the log misleading. -inrange()- is already false for
* missing, so the other three filters need no extra guard.

keep if inrange(exp,0,40)
keep if wkswork1 >= 10 & !missing(wkswork1)
keep if inrange(uhrswork,10,98.9)
keep if inrange(wage,1,150)
keep if educ > 2 & !missing(educ) // Ensure all workers with less than 9 years of education are dropped

log close
*===============================================================================
*===============================================================================
*===============================================================================

* Top-coding report, run on the data as they stand at this point in the script.
log using "$logs\topcode_report_post_samplerestrictions.log", replace

* Flags: wage income of 240000 or more, and usual hours equal to 99.
* Each flag is missing when its source variable is missing.
* NOTE(review): the restriction step earlier in this script keeps only
* uhrswork up to 98.9, so uhrs_tc looks like it is always 0 here; confirm
* that this is intended.
generate inc_tc = (incwage >= 240000) if incwage != .
generate uhrs_tc = (uhrswork == 99) if uhrswork != .

* Weighted distribution of income and usual hours, all observations *
summarize incwage uhrswork [aweight = perwt], detail

* Weighted distribution among the flagged observations only *
summarize incwage [aweight = perwt] if inc_tc == 1, detail
summarize uhrswork [aweight = perwt] if uhrs_tc == 1, detail

* FRACTION INC/HOURS TOPCODED BY EDL *
* The weighted mean of a 0/1 flag is the weighted share that is flagged.
bysort edl: summarize inc_tc uhrs_tc [aweight = perwt]

log close
*===============================================================================
*===============================================================================
*===============================================================================
* Drop everything that is not part of the analysis file: identifiers and
* survey-design variables, the temporary top-code flags, and the raw inputs
* that the constructed variables replace.
drop pernum serial hhwt cluster strata ///
	inc_tc uhrs_tc ///
	sample year inds raced hispan hispand gq educ educd ind sex

* Put the analysis variables first, in a fixed order; any remaining variables
* follow in their existing order.
order perwt region division statefip ///
	female age exp race edl edyrs ///
	occ_major occ_minor occ_det ind1 ///
	wkswork1 uhrswork incwage wage lnwage

*-----------*
* SAVE DATA *
*-----------*
* Overwrites any existing file of the same name.
save "$temp\clean_census_5pct.dta", replace







